Preparation

##### Load required R libraries
library(Seurat)
require(scales)
require(ggplot2)
require(viridis)
require(dplyr)
require(GeneTrajectory)
require(Matrix)
require(plot3D)
library(openxlsx)

Loading preprocessed data

setwd("/data/sci-fate_analysis/data/")

data_S <- readRDS("/data/sci-fate_analysis/data/data_S_scifate_CC_GR.rds")
DimPlot(data_S, group.by = "treatment_time", reduction = "umap.orig")

DimPlot(data_S, group.by = "treatment_time", reduction = "umap.new.GR")

TF_gene_summary <- read.xlsx("../41587_2020_480_MOESM3_ESM.xlsx", sheet = 2)
colnames(TF_gene_summary) <- TF_gene_summary[1,]
TF_gene_summary <- TF_gene_summary[-1,]
head(TF_gene_summary)
##   TF linked_gene     linked_gene_id     TF_link Regression coefficient
## 2 AR       LIN7A  ENSG00000111052.3    AR_LIN7A  5.9163964032391797E-2
## 3 AR        GPC6  ENSG00000183098.6     AR_GPC6  4.7126417860035602E-2
## 4 AR       BAMBI  ENSG00000095739.7    AR_BAMBI  4.7107661847319897E-2
## 5 AR        SYT1  ENSG00000067715.9     AR_SYT1  4.2513331324419798E-2
## 6 AR    ARHGAP24 ENSG00000138639.13 AR_ARHGAP24    4.20068808011929E-2
## 7 AR       RBMS3 ENSG00000144642.16    AR_RBMS3  4.1794736793863203E-2
##                                        group
## 2 Activating links by newly synthesized mRNA
## 3 Activating links by newly synthesized mRNA
## 4 Activating links by newly synthesized mRNA
## 5 Activating links by newly synthesized mRNA
## 6 Activating links by newly synthesized mRNA
## 7 Activating links by newly synthesized mRNA
length(unique(TF_gene_summary$TF))
## [1] 60
CC_TFs <- c("E2F1","E2F2","E2F7","EZH2","BRCA1","E2F8","FOXM1","MYBL2")
GR_TFs <- c("BCL6", "FOXO1", "CEBPB", "GTF2IRD1", "JUNB", "RARB", "THRA")
CC_genes <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF %in% CC_TFs)])
GR_genes <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF %in% GR_TFs)])
all_genes <- unique(c(CC_genes, GR_genes))
length(all_genes)
## [1] 674

Gene-gene distance computation

Prepare the input for gene-gene Wasserstein distance computation
data_S <- GeneTrajectory::RunDM(data_S, reduction = "pca.joint", dims = 1:20)
cell.graph.dist <- GetGraphDistance(data_S, K = 5)
## Constructing kNN graph
## Constructing graph distance matrix
## The largest graph distance is 46
genes <- all_genes
cg_output <- CoarseGrain(data_S, cell.graph.dist, genes, N = 1000, assay = "RNA_new")
## Run k-means clustering
## Coarse-grain matrices
#remove the gene with zero count
which(apply(cg_output[["gene.expression"]], 2, sum) == 0)
## ENSG00000268942.1 
##               429
#####Save the files for computation in Python
dir.path <- "/banach1/rq25/tmp/sci-fate_example/"
dir.create(dir.path)
write.table(cg_output[["features"]][-which(apply(cg_output[["gene.expression"]], 2, sum) == 0)], paste0(dir.path, "gene_names.csv"), row.names = F, col.names = F, sep = ",")
write.table(cg_output[["graph.dist"]], paste0(dir.path, "ot_cost.csv"), row.names = F, col.names = F, sep = ",")
Matrix::writeMM(Matrix(cg_output[["gene.expression"]][,-which(apply(cg_output[["gene.expression"]], 2, sum) == 0)], sparse = T), paste0(dir.path, "gene_expression.mtx"))
## NULL

Wasserstein distance computation in Python

# Make sure to install the latest version of POT module (python), using the following:
# pip install -U https://github.com/PythonOT/POT/archive/master.zip
# Run the following command to get the gene-gene Wasserstein distance matrix

#system(sprintf("nohup /data/rihao/anaconda3/bin/python /data/rihao/GeneTrajectory/GeneTrajectory/python/gene_distance_cal_parallel.py %s &", dir.path))

Gene trajectory inference and visualization

setwd("/data/sci-fate_analysis/data/")
dir.path <- "./GT_CC_GR_genes_DM5_K5_N1000_RNAnew/"
gene.dist.mat.all <- LoadGeneDistMat(dir.path, file_name = "emd.csv")
gene.dist.mat <- gene.dist.mat.all
gene_embedding <- GetGeneEmbedding(gene.dist.mat, K = 5)$diffu.emb

colvar <- rep(0, nrow(gene_embedding))
colvar[which(rownames(gene_embedding) %in% CC_genes)] <- 1
scatter3D(gene_embedding[,1],
          gene_embedding[,2],
          gene_embedding[,3],
          bty = "b2", colvar = colvar, 
          main = "trajectory", pch = 19, cex = 1, theta = 45, phi = 0,
          col = ramp.col(c(hue_pal()(3))))

Visualize gene bin plots

##### Focus on genes expressed in 10%-75% cells
assay <- "RNA_new"
DefaultAssay(data_S) <- assay
all_genes <- unique(c(CC_genes, GR_genes))
expr_percent <- apply(as.matrix(data_S[[assay]]$data[all_genes, ]) > 0, 1, sum)/ncol(data_S)
genes <- all_genes[which(expr_percent > 0.1 & expr_percent < 0.75)]
length(genes)
## [1] 618
gene.dist.mat.all <- gene.dist.mat.all[genes, genes]

##### Define CC gene ordering
gene.dist.mat <- gene.dist.mat.all[which(rownames(gene.dist.mat.all) %in% c(CC_genes)), 
                                   which(rownames(gene.dist.mat.all) %in% c(CC_genes))]
gene_embedding <- GetGeneEmbedding(gene.dist.mat, K = 5)$diffu.emb
gene_trajectory <- ExtractGeneTrajectory(gene_embedding, gene.dist.mat, N = 1, t.list = c(1000), K = 5)
## ENSG00000187837.2
gene_trajectory_CC_RNA_new <- gene_trajectory

##### Define GR gene ordering
gene.dist.mat <- gene.dist.mat.all[which(rownames(gene.dist.mat.all) %in% c(GR_genes)), 
                                   which(rownames(gene.dist.mat.all) %in% c(GR_genes))]
gene_embedding <- GetGeneEmbedding(gene.dist.mat, K = 5)$diffu.emb
gene_trajectory <- ExtractGeneTrajectory(gene_embedding, gene.dist.mat, N = 1, t.list = c(1000), K = 5)
## ENSG00000163535.13
gene_trajectory_GR_RNA_new <- gene_trajectory


Nbin <- 7
##### Visualize CC gene plots
gene_trajectory_CC_RNA_new -> gene_trajectory
data_S <- AddGeneBinScore(data_S, gene_trajectory, N.bin = Nbin, trajectories = 1:1, assay = "RNA_new")
FeaturePlot(data_S, pt.size = 0.05, features = paste0("Trajectory",1,"_genes", 1:Nbin), ncol = Nbin, order = F, reduction = "umap.orig") &
  scale_color_gradientn(colors = rev(brewer_pal(palette = "RdYlBu")(10))) & NoLegend() & NoAxes()
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

##### Visualize GR gene plots
gene_trajectory_GR_RNA_new -> gene_trajectory
data_S <- AddGeneBinScore(data_S, gene_trajectory, N.bin = Nbin, trajectories = 1:1, reverse = T, assay = "RNA_new")
FeaturePlot(data_S, pt.size = 0.05, features = paste0("Trajectory",1,"_genes", 1:Nbin), ncol = Nbin, order = F, reduction = "umap.new.GR") &
  scale_color_gradientn(colors = rev(brewer_pal(palette = "RdYlBu")(10))) & NoLegend() & NoAxes()
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

Visualize gene distribution plots for CC process

##### CC gene distribution plots

gene_trajectory_CC_RNA_new -> gene_trajectory
gene_trajectory <- gene_trajectory[order(gene_trajectory$Pseudoorder1),]

CC_TFs <- c("E2F1","E2F2","E2F7","EZH2","BRCA1","E2F8","FOXM1","MYBL2")
CC_TF_gene_lists <- list()
for (i in 1:length(CC_TFs)){
  CC_TF_gene_lists[[CC_TFs[i]]] <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == CC_TFs[i])])
}

freq_table <- table(do.call(c,CC_TF_gene_lists))
freq = as.integer(freq_table)
names(freq) = names(freq_table)

ggplot_list <- list()
for (i in 1:length(CC_TFs)){
  gene_set <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == CC_TFs[i])])
  
  CC_genes_df <- data.frame(genes = rownames(gene_trajectory),
                            group = NA)
  CC_genes_df$freq <- freq[rownames(gene_trajectory)]
  CC_genes_df$group[which(CC_genes_df$genes %in% gene_set)] = CC_TFs[i]
  CC_genes_df$index <- 1:nrow(CC_genes_df)
  
  ggplot_list[[i]] <- 
    ggplot(CC_genes_df %>% filter(!is.na(group) & freq <= 100), aes(x = index, fill = group)) +
    geom_density(linetype = 1, alpha = 0.75) + xlim(c(1, nrow(gene_trajectory)))  +
    scale_fill_manual(values = c("darkblue", "orange", "forestgreen"))+ #NoLegend()+
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
          panel.background = element_blank(), axis.line = element_line(colour = "black")) +
    ggtitle(CC_TFs[i])
}

print(ggpubr::ggarrange(plotlist = ggplot_list, 
                        ncol = 4, nrow = 2, align = "v"))

Visualize gene distribution plots for CC process: E2F1, BRCA1, FOXM1

CC_TFs2 <- c("E2F1","BRCA1","FOXM1")
CC_TF_gene_lists <- list()
for (i in 1:length(CC_TFs2)){
  CC_TF_gene_lists[[CC_TFs2[i]]] <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == CC_TFs2[i])])
}
freq_table <- table(do.call(c,CC_TF_gene_lists))
freq = as.integer(freq_table)
names(freq) = names(freq_table)

CC_genes_df <- data.frame(genes = rownames(gene_trajectory),
                          group = NA)
CC_genes_df$freq <- freq[rownames(gene_trajectory)]
gene_set <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == CC_TFs2[1])])
CC_genes_df$group[which(CC_genes_df$genes %in% gene_set)] = CC_TFs2[1]
gene_set <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == CC_TFs2[2])])
CC_genes_df$group[which(CC_genes_df$genes %in% gene_set)] = CC_TFs2[2]
gene_set <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == CC_TFs2[3])])
CC_genes_df$group[which(CC_genes_df$genes %in% gene_set)] = CC_TFs2[3]
CC_genes_df$index <- 1:nrow(CC_genes_df)

p <- 
  ggplot(CC_genes_df %>% filter(!is.na(group) & freq ==1), aes(x = index, fill = group)) +
  geom_density(linetype = 1, alpha = 0.75) + xlim(c(1, nrow(gene_trajectory)))  +
  scale_fill_manual(values = c("purple", "orange", "forestgreen"))+ 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"))
p

Visualize gene distribution plots for GR process

##### GR gene distribution plots

gene_trajectory_GR_RNA_new -> gene_trajectory
gene_trajectory <- gene_trajectory[order(gene_trajectory$Pseudoorder1),]

GR_TFs <- c("BCL6", "FOXO1", "CEBPB", "GTF2IRD1", "JUNB", "RARB", "THRA")

GR_TF_gene_lists <- list()
for (i in 1:length(GR_TFs)){
  GR_TF_gene_lists[[GR_TFs[i]]] <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == GR_TFs[i])])
}
freq_table <- table(do.call(c,GR_TF_gene_lists))
freq = as.integer(freq_table)
names(freq) = names(freq_table)

ggplot_list <- list()
for (i in 1:length(GR_TFs)){
  gene_set <- unique(TF_gene_summary$linked_gene_id[which(TF_gene_summary$TF == GR_TFs[i])])
  
  GR_genes_df <- data.frame(genes = rownames(gene_trajectory),
                            group = NA)
  GR_genes_df$freq <- freq[rownames(gene_trajectory)]
  GR_genes_df$group[which(GR_genes_df$genes %in% gene_set)] = GR_TFs[i]
  GR_genes_df$index <- 1:nrow(GR_genes_df)
  
  ggplot_list[[i]] <- 
    ggplot(GR_genes_df %>% filter(!is.na(group) & freq <= 100), aes(x = index, fill = group)) +
    geom_density(linetype = 1, alpha = 0.75) + xlim(c(1, nrow(gene_trajectory)))  +
    scale_fill_manual(values = c("darkblue", "orange", "forestgreen"))+ 
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
          panel.background = element_blank(), axis.line = element_line(colour = "black")) +
    ggtitle(GR_TFs[i])
}
print(ggpubr::ggarrange(plotlist = ggplot_list, 
                        ncol = 4, nrow = 2, align = "v"))